clear all
	
set type double /* added on 10sep22*/
set matsize 8000


*******使用するサブルーチン*****************************
	qui do "modules/banch.do"
************************************************************
		insheet using "../data/address_license_121818.csv", clear comma
		gen pref_id = .
		replace pref_id = 1 if v2 == "北海道"
		replace pref_id = 2 if v2 == "青森県"
		replace pref_id = 3 if v2 == "岩手県"
		replace pref_id = 4 if v2 == "宮城県"
		replace pref_id = 5 if v2 == "秋田県"
		replace pref_id = 6 if v2 == "山形県"
		replace pref_id = 7 if v2 == "福島県"
		replace pref_id = 8 if v2 == "茨城県"
		replace pref_id = 9 if v2 == "栃木県"
		replace pref_id = 10 if v2 == "群馬県"
		replace pref_id = 11 if v2 == "埼玉県"
		replace pref_id = 12 if v2 == "千葉県"
		replace pref_id = 13 if v2 == "東京都"
		replace pref_id = 14 if v2 == "神奈川県"
		replace pref_id = 15 if v2 == "新潟県"
		replace pref_id = 16 if v2 == "富山県"
		replace pref_id = 17 if v2 == "石川県"
		replace pref_id = 18 if v2 == "福井県"
		replace pref_id = 19 if v2 == "山梨県"
		replace pref_id = 20 if v2 == "長野県"
		replace pref_id = 21 if v2 == "岐阜県"
		replace pref_id = 22 if v2 == "静岡県"
		replace pref_id = 23 if v2 == "愛知県"
		replace pref_id = 24 if v2 == "三重県"
		replace pref_id = 25 if v2 == "滋賀県"
		replace pref_id = 26 if v2 == "京都府"
		replace pref_id = 27 if v2 == "大阪府"
		replace pref_id = 28 if v2 == "兵庫県"
		replace pref_id = 29 if v2 == "奈良県"
		replace pref_id = 30 if v2 == "和歌山県"
		replace pref_id = 31 if v2 == "鳥取県"
		replace pref_id = 32 if v2 == "島根県"
		replace pref_id = 33 if v2 == "岡山県"
		replace pref_id = 34 if v2 == "広島県"
		replace pref_id = 35 if v2 == "山口県"
		replace pref_id = 36 if v2 == "徳島県"
		replace pref_id = 37 if v2 == "香川県"
		replace pref_id = 38 if v2 == "愛媛県"
		replace pref_id = 39 if v2 == "高知県"
		replace pref_id = 40 if v2 == "福岡県"
		replace pref_id = 41 if v2 == "佐賀県"
		replace pref_id = 42 if v2 == "長崎県"
		replace pref_id = 43 if v2 == "熊本県"
		replace pref_id = 44 if v2 == "大分県"
		replace pref_id = 45 if v2 == "宮崎県"
		replace pref_id = 46 if v2 == "鹿児島県"
		replace pref_id = 47 if v2 == "沖縄県"

		destring v1, gen(drid) force
		drop if drid == .
		drop v1
		gen no_license = regexs(1) if regexm(v3, "第([0-9]+)号")
		destring no_license, replace
		rename v7 ad_firm
		rename v4 nm_firm
		
	*************************************************************
	* Replace \u3000 及び　半角空白　to 全角空白
	foreach x in "ad_firm" "nm_firm" {
		replace `x' = usubinstr(`x',"\u3000","　",.)
		replace `x' = usubinstr(`x'," ","　",.) /* 半角空白->全角空白*/
	}
	*連続空白除去                              *
	foreach x in "ad_firm" "nm_firm" {
		qui count if ustrregexm(`x', "\s{2,}")
		local c_initial = r(N)
		local c = `c_initial'
		while `c' > 0 {
			qui replace `x' = usubinstr(`x',"　　","　",.)
			qui count if ustrregexm(`x', "\s{2,}")
			local c = `r(N)'
		}
		di as result "`x'の連続の空白除去完了: `c_initial' changes made!"
	}
	*先頭、末尾の全角/半角空白除去; \sは半角全角双方の空白に対応するみたい	*
	foreach x in "ad_firm" "nm_firm" {
		qui count if ustrregexm(`x',"^\s+(.+)")	
		qui replace `x' = ustrregexs(1) if ustrregexm(`x',"^\s+(.+)")	
		di as result "`x'の先頭の空白除去: `r(N)' changes made!"
	}
	foreach x in "ad_firm" "nm_firm" {
		qui count if ustrregexm(`x',"(.+)\s+$")	
		qui replace `x' = ustrregexs(1) if ustrregexm(`x',"(.+)\s+$")
		di as result "`x'の末尾の空白除去: `r(N)' changes made!"
	}
	
	/* Check 半角空白,連続空白はないはず*/
	foreach x in "ad_firm" "nm_firm" {
		qui count if ustrregexm(`x', " ") 
		if r(N) > 0{
			di "`x'に半角空白が残っている！"
			stop
		}
		qui count if ustrregexm(`x', "\s{2,}")
		if r(N) > 0{
			di "`x'に半角空白が残っている！"
			stop
		}
	}
	*住所の短いハイフンは全角ハイフンに
	replace ad_firm = usubinstr(ad_firm,"‐","－",.)
	replace ad_firm = usubinstr(ad_firm,"-","－",.)
	replace ad_firm = usubinstr(ad_firm,"―","－",.) /*ハイフン*/
	replace ad_firm = usubinstr(ad_firm,"ー","－",.) /*bar*/
		
		
	replace no_license = pref_id * 1000000 + no_license if pref_id ~= .
	gen hq = (v6 == "主たる営業所")
	order drid no_license pref_id hq

	*番地 -> program banch を使って一括変換
	*gen ad_firma = ad_firm
	count if ustrregexm(ad_firm, "番地")
	banch ad_firm
	count if ustrregexm(ad_firm, "番地")
	format %70s ad_firm*
	
	
	/* check duplicates of no_lice*/
		cap drop dupp
		bys hq no_licen: gen dupp = _N if hq == 1 & pref_id ~= .
		tab dupp
		order dupp
		count if dupp == 10201
		if r(N) > 0{
			stopp duplicates
		}
		
	sum drid, d
	compress
	format %20s v5 v6 
	format %40s ad_fi 
	save ../tmp/address_license_121818.dta, replace 

	use ../tmp/address_license_121818.dta, clear
	gsort nm_firm ad_firm -hq 
	bysort nm_firm ad_firm: gen aaa = _n

	*format %40s nm* ad*
	*order aaa nm_firm* ad_firm* v*

	keep if aaa == 1
	*br if aaaN > 1
	drop aaa*
	keep nm_firm ad_firm hq no_license
	compress
	save address_license_ad_nm_121818.dta, replace

	use ../tmp/address_license_121818.dta, clear
		gsort nm_firm no_license -hq
		bysort nm_firm no_license: gen aaa = _n
		keep if aaa == 1
		drop aaa
		keep nm_firm ad_firm hq no_license
		compress
	save ../tmp/address_license_li_nm_121818.dta, replace
	
	
	erase ../tmp/address_license_121818.dta
*****************************************************

use "../tmp/tempTTT.dta", replace
keep rid nm_firm ad_firm no_license fy_con region date2005_ct 
order no_license

/* 許可番号が欠損しているデータに許可番号を付与(建設業許可番号サイトからのデータ）*/
	*keep if no_license == . /*| ad_firm == ""*/
	rename no_license no_license2	
	*gen hq = 1
	merge m:1 nm_firm ad_firm using address_license_ad_nm_121818.dta
	drop if _merge == 2
	replace no_license2 = no_license if no_license2 == .
	drop no_license
	rename no_license2 no_license
	drop _merge
	erase address_license_ad_nm_121818.dta

/* 住所が欠損しているデータに住所を付与*/
	rename ad_firm ad_firm2
	merge m:1 nm_firm no_license using ../tmp/address_license_li_nm_121818.dta
	drop if _merge == 2
	replace ad_firm2 = ad_firm if ad_firm2 == ""
	drop ad_firm
	rename ad_firm2 ad_firm
	erase ../tmp/address_license_li_nm_121818.dta

/* 同一住所に複数の許可番号:一つの許可番号に統一し、住所が入っていないものについてもそれを適用する*/
	/*該当データの参照*/
		cap drop tag_nmadli
		egen tag_nmadli = tag(nm_firm ad_firm no_license)
		bys nm_firm ad_firm no_license: gen tag_nmadli_N = _N 
		bys nm_firm ad_firm: egen c_tag_nmadli = total(tag_nmadli)
		tab c_tag_nmad
		sort nm_firm ad_firm no_license
		order fy_con tag* c_tag_nmad region nm_firm ad_firm no_li*
		format %40s nm_firm
		format %50s ad_firm
		/*br if c_tag_nm == 0 & fy_con > 2012 & fy_con < 2018 ///
			& regexm(nm_xlfile, ".+g\.xls.*") == 0
		br if c_tag_nm > 1 & fy_con > 2012 & fy_con < 2018 ///
			& regexm(nm_xlfile, ".+g\.xls.*") == 0 & tag_nmadli == 1*/
		*br if no_license < . & ad_firm == ""
		
	/* 許可番号があり、住所が一部空白の業者の空白住所を埋める*/
		gen ad_firm_original = ad_firm
		*replace ad_firm = ad_firm_original
		gsort nm_firm no_license -ad_firm
		by nm_firm no_license: replace ad_firm = ad_firm[_n-1] ///
			if ad_firm[_n-1] ~= "" & ad_firm == "" & no_license < . & _n > 1

		drop hq
		drop _merge
		compress
		
/* firmIDを作成*/
	/*Step 1:業者名・住所でIDを作る(空白住所それ自体を住所とみなす)*/
		replace ad_firm = "a" if ad_firm == ""　/*空白住所でもIDを付与するため*/
		egen fnmadID0 = group(nm_firm ad_firm)
		replace ad_firm = "" if ad_firm == "a"
		
	/* Step2: ID一つに複数ライセンス番号の業者を対象にIDの統一をする*/
		* 一IDに複数ライセンス番号業者のIDを選ぶ（その他の業者は.)
		cap drop fnmadID max_li min_li
		bys nm_firm ad_firm: egen max_license = max(no_license)
		bys nm_firm ad_firm: egen min_license = min(no_license)
		gen fnmadID = fnmadID0 if max_li ~= min_li
	
		/* その業者名と許可番号があればmissingのIDを埋める（A） *例)br if nm_firm == "弘安建設（株）"*/
		sort nm_firm no_license fnmadID	/* Note: sort,gsortは、missingが常に一番下に来る*/
		by nm_firm no_license: replace fnmadID = fnmadID[_n - 1] ///
												if _n > 1 & no_license < .
		/*日本橋梁のように、2つ以上の許可番号、それぞれの許可番号に2以上の住所があると、
			IDは許可番号ごとに異なるものになっている。下の"（B）"を回すことで解決させる。*/
		*format %40s nm* ad*
		*order fnmadID* max* min* no_license nm_firm ad_firm 	
			
	/* Step3: 住所と企業名が同じなら、同一IDにする（maxで） */
	bys nm_firm ad_firm: egen fnmadID2 = max(fnmadID)
	order fnmadID2
	
	/* ID一つに一つのライセンス番号の業者について、fnmadID2を埋める（fnmadIDから）*/
	replace fnmadID2 = fnmadID0 if fnmadID2 == .
	
	/* 住所もライセンス番号も記録されていないものは、fnmadID2をmissing にする*/
	replace fnmadID2 = . if ad_firm == "" & no_license == .		

	/* 業者名と許可番号が同一ならば、IDを一つに統一する（B）*/
		/* 例) br if nm_firm == "あおみ建設（株）" & (no_license < . | ad_firm ~= "")
		一つの許可番号：2245に3種類の住所.max_li = min_liなので、fnmadID2は3種類、fnmadID0と同一.これをmaxをとって統一する*/
	cap drop vvv
	bys nm_firm no_license: egen vvv = max(fnmadID2) if no_license < .
	count if vvv ~= fnmadID2 
	*order vvv
	/* vvvは住所のみ、許可番号無しはmissingになっている*/
	/* fnmadID2 は住所のみでも記録されている*/
		egen fnmadID3 = rowmax(fnmadID2 vvv)
		count if fnmadID2 ~= fnmadID0 & (no_license < . | ad_firm ~= "")
		count if fnmadID2 ~= fnmadID3 & (no_license < . | ad_firm ~= "")
		*br if fnmadID2 ~= fnmadID3
		drop vvv
		order fnmadID*
	
	/* 業者名住所が同一なら、fnmadID3を一つに統一
	例） 
	br if nm_firm == "アイサワ工業（株）" & (no_license < . | ad_firm ~= "")
		fnmadID3の業者名住所ごとのmaxをとって統一する*/
	bys nm_firm ad_firm: egen fnmadID4 = max(fnmadID3)
	
/* Verify*/
	/*業者名住所同一ならfnmadID4は一種類なはず*/
	cap drop hhh*
	egen hhh = tag(nm_firm ad_firm fnmadID4)
	bys nm_firm ad_firm: egen hhh2 = total(hhh)
	tab hhh2

	/*業者名許可番号同一ならfnmadID4は一種類なはず*/
	cap drop hhh*
	egen hhh = tag(nm_firm no_license fnmadID4)
	bys nm_firm no_license: egen hhh2 = total(hhh)
	tab hhh2
	
	count if no_license < . | ad_firm ~= ""
	sum fnmadID* if no_license < . | ad_firm ~= ""
	sum fnmadID4 fnmadID*
	
	order fy_con region fnmadID* no_license nm_firm ad_firm*
	sort region nm_firm fnmadID3

	/*br if ustrregexm(ad_firm, "丁目")
	br if ustrregexm(ad_firm, "番地")
	br if ustrregexm(nm_firm, "荒井建設（株）")*/
	
	/*sort nm_firm ad_firm no_license
	br if nm_firm == "（株）安部日鋼工業" & (no_license < . | ad_firm ~= "")
	br if nm_firm == "弘安建設（株）" & (no_license < . | ad_firm ~= "")
	br if nm_firm == "志田建設（株）" & (no_license < . | ad_firm ~= "")
	br if nm_firm == "あおみ建設（株）" & (no_license < . | ad_firm ~= "")
	br if nm_firm == "西田建設（株）"& (no_license < . | ad_firm ~= "")
	br if nm_firm == "日本橋梁（株）" & (no_license < . | ad_firm ~= "")
	br if nm_firm == "ユウテック（株）" & (no_license < . | ad_firm ~= "")
		br if nm_firm == "（株）安部日鋼工業"
	*/
	*drop n_nm_firm dd FLAG_decimal v2-_merge 
	*drop rakusatsu prob_ad del
	keep rid fnmadID4 ad_firm no_license nm_firm
	rename fnmadID4 firmID
	rename nm_firm nm_firm_new
	rename ad_firm ad_firm_new
	rename no_license no_license_new
	drop if firmID == . & no_license_new == . & ad_firm_new == ""
	
	/* pref city */
		cap drop pref
		gen pref = ustrregexs(1) if ustrregexm(ad_firm, "(^東京都|北海道|京都府|大阪府|.+[^市]県)")
		*tab pref
	/* remove xx郡 from muni */
		cap drop city
		*tab muni if regexm(muni, "^.+郡") == 1 & regexm(muni, "蒲郡|小郡|大和郡山|上郡") == 0
		gen city　= subinstr(ad_firm, pref,"",.)
		replace city = ustrregexrf(city, "吉敷郡|海上郡|添上郡|高市郡","")
		replace city = ustrregexs(1) if ustrregexm(city, "^([^市町村区]+市)")
		replace city = ustrregexs(1) if ustrregexm(city, "^([^市町村区]+町)")
		replace city = ustrregexs(1) if ustrregexm(city, "^([^市町村区]+村)")
		replace city = ustrregexs(1) if ustrregexm(city, "^([^市町村区]+区)")
		cap drop aaa
		*gen aaa = city
		replace city = ustrregexrf(city, "^.+郡","") if ustrregexm(city, "蒲郡|小郡|大和郡山|上郡") == 0
		replace city = ustrregexrf(city, "吉敷郡|海上郡|添上郡|高市郡","")
		*tab city if ustrregexm(city, "郡") == 1
		*br ad_firm pref city aaa if city == "高市"
		
		format %70s ad_*
		format %30s pref city
		*br if ad_firm ~= "" & pref == ""
		*br if ad_firm ~= "" & city == ""
	compress
		
	*cd ../do
*egen firmID = group(region nm_firm no_license)
	save ../tmp/temp_rid_address_license_firmID.dta, replace

